Kapitel 6.2: Zentralität – Ergebnisse¶

Das Notebook ergänzt Kapitel 6.2 'Zentralität'.

Import¶

In [1]:
import pandas as pd
import plotly.express as px
from tqdm.notebook import tqdm

from resources_statistics import *
from resources_geschichtslyrik import *

import random
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from scipy.spatial import distance
In [2]:
pd.set_option('display.max_colwidth', None)
In [3]:
meta = pd.read_json(r"../resources/meta.json")
In [4]:
meta_mode_strikt = pd.read_csv("../resources/more/vectors/mode_strikt.csv", index_col = [0])
meta_mode_flexibel = pd.read_csv("../resources/more/vectors/mode_flexibel.csv", index_col = [0])
meta_mode_strikt1850 = pd.read_csv("../resources/more/vectors/mode_strikt1850.csv", index_col = [0])
In [5]:
features_used_df = pd.read_csv("../resources/more/vectors/vectordist_features.csv", index_col = [0])
meta_all_features = pd.read_csv("../resources/more/vectors/vectordist.csv", index_col = [0])
features_used = features_used_df['feature'].tolist()
In [6]:
dm_manhattan = pd.read_csv("../resources/more/vectors/vectordist_dm_manhattan.csv", index_col = [0])
dm_euclidean = pd.read_csv("../resources/more/vectors/vectordist_dm_euclidean.csv", index_col = [0])
dm_cosine = pd.read_csv("../resources/more/vectors/vectordist_dm_cosine.csv", index_col = [0])
dm_alldistances = pd.read_csv("../resources/more/vectors/vectordist_dm_alldistances.csv", index_col = [0])
In [7]:
dm_manhattan_unweighted = pd.read_csv("../resources/more/vectors/vectordist_dm_manhattan_unweighted.csv", index_col = [0])
dm_euclidean_unweighted = pd.read_csv("../resources/more/vectors/vectordist_dm_euclidean_unweighted.csv", index_col = [0])
dm_cosine_unweighted = pd.read_csv("../resources/more/vectors/vectordist_dm_cosine_unweighted.csv", index_col = [0])
dm_alldistances_unweighted = pd.read_csv("../resources/more/vectors/vectordist_dm_alldistances_unweighted.csv", index_col = [0])
In [8]:
meta_dists = pd.read_csv("../resources/more/vectors/vectordist_dists.csv", index_col = [0])

Korpora¶

In [9]:
meta['count'] = meta.query("corpus=='anth'").groupby('author_title')['author_title'].transform('count')
In [10]:
# Anthology corpus: history poems ('geschichtslyrik') from 1850-1918,
# deduplicated so each author/title pair appears once.
meta_anth = (
    meta
    .query("corpus == 'anth' and 1850 <= year <= 1918 and geschichtslyrik == 1")
    .drop_duplicates(subset='author_title')
    .reset_index(drop=True)
)
In [11]:
# Contrast corpus 1: history poems by four canonized modernist authors.
modcanon_authors = ['Hofmannsthal, Hugo von', 'Rilke, Rainer Maria', 'George, Stefan', 'Heym, Georg']

meta_modcanon = (
    meta
    .query("author in @modcanon_authors and 1850 <= year <= 1918 and geschichtslyrik == 1")
    .drop_duplicates(subset='author_title')
    .reset_index(drop=True)
)
In [12]:
# Contrast corpus 2: history poems by the Münchhausen circle.
muench_authors = ['Münchhausen, Börries von', 'Miegel, Agnes', 'Strauß und Torney, Lulu von']

meta_muench = (
    meta
    .query("author in @muench_authors and 1850 <= year <= 1918 and geschichtslyrik == 1")
    .drop_duplicates(subset='author_title')
    .reset_index(drop=True)
)
In [13]:
# Union of the three corpora, deduplicated by text id.
meta_all = pd.concat([meta_anth, meta_modcanon, meta_muench])
meta_all = meta_all.drop_duplicates(subset = 'id')
meta_all = meta_all.reset_index(drop = True)

# Corpus-membership flags. Vectorized isin() replaces the former per-row
# `x in list(...)` comprehensions, which rebuilt the lookup list for every
# row (O(n^2) overall).
meta_all['korpus_anth'] = meta_all['author_title'].isin(meta_anth['author_title'])
meta_all['korpus_modcanon'] = meta_all['author'].isin(modcanon_authors)
meta_all['korpus_muench'] = meta_all['author'].isin(muench_authors)

meta_all.shape[0]
Out[13]:
2063
In [14]:
print(meta_all.shape[0])
print(meta_mode_strikt.shape[0])
print(meta_mode_flexibel.shape[0])
print(meta_mode_strikt1850.shape[0])
print(meta_all_features.shape[0])
print(meta_dists.shape[0])
2063
2063
2063
2063
2063
2063
In [15]:
# Attach the precomputed mode/feature/distance vectors to the master table.
# Each merge is an inner join on 'id'; the shape check below confirms that
# no rows were lost (all vector files cover the same 2063 texts).
meta_all = meta_all.merge(meta_mode_strikt, on = 'id')
meta_all = meta_all.merge(meta_mode_flexibel, on = 'id')
meta_all = meta_all.merge(meta_mode_strikt1850, on = 'id')
meta_all = meta_all.merge(meta_all_features, on = 'id')
meta_all = meta_all.merge(meta_dists, on = 'id')

meta_all.shape[0]
Out[15]:
2063

Berechnung Netzwerk¶

In [16]:
this_dm = dm_manhattan
In [17]:
import networkx as nx
In [18]:
def create_edge_table(distance_matrix, filter_std = 1.5):
    """Turn a square distance matrix into an edge table for network building.

    Parameters
    ----------
    distance_matrix : pd.DataFrame
        Square matrix of pairwise distances (text ids as index and columns).
    filter_std : float or False, default 1.5
        If truthy, keep only edges whose similarity exceeds the mean
        similarity by more than `filter_std` standard deviations.
        Pass False (or 0) to keep every edge.

    Returns
    -------
    pd.DataFrame
        Columns ['text1', 'text2', 'distance', 'similarity'], self-loops
        (text1 == text2) removed. Each undirected pair appears twice
        (a->b and b->a), as produced by stacking the full matrix.
    """
    results = distance_matrix.stack().reset_index()
    results.columns = ['text1', 'text2', 'distance']

    # similarity = max_distance - distance. The original three-step mirroring
    # ((d - max/2) * -1 + max/2) algebraically reduces to exactly this; the
    # Python list comprehension over the Series is replaced by vectorized ops.
    results['similarity'] = results['distance'].max() - results['distance']

    # Drop self-loops (diagonal of the matrix).
    results = results.loc[results['text1'] != results['text2']]

    if filter_std:
        # Keep only unusually similar pairs (right tail of the distribution).
        threshold = results['similarity'].mean() + filter_std * results['similarity'].std()
        results = results.loc[results['similarity'] > threshold]

    return results
In [19]:
# Anthology-corpus subset: the ids select rows/columns in the distance matrix,
# the index positions computed centrality values back into meta_all.
this_meta = meta_all.query("korpus_anth")
this_ids = this_meta['id']
this_index = this_meta.index
In [20]:
# Build the complete (unfiltered) similarity graph over the anthology texts;
# every pair of texts becomes an edge carrying distance and similarity.
edge_table = create_edge_table(this_dm.loc[this_ids, this_ids], filter_std = False)
G = nx.from_pandas_edgelist(edge_table, 'text1', 'text2', ['distance', 'similarity'])
In [21]:
# Similarity-weighted eigenvector centrality per text (node keys are text ids).
eigenvector_centrality_dic = nx.eigenvector_centrality(G, weight = 'similarity')
# Align values to the texts' ids explicitly instead of relying on the dict's
# iteration order matching `this_index`: node order in G follows edge insertion
# order, which is not guaranteed to equal corpus order (and would silently
# misalign if the edge table were ever filtered).
meta_all.loc[this_index, 'eigenvector_centrality'] = this_ids.map(eigenvector_centrality_dic).values

Zusammenhänge¶

In [22]:
# Box plot for the anthology corpus: strict mode score (x) against mean
# weighted euclidean distance (y) — how closely do the two centrality
# operationalizations agree? (Figure is also written to plots/ as PDF.)
meta_plot = meta_all.query("korpus_anth")

meta_plot = meta_plot.rename(columns={
    'mode_score_strikt' : 'Modus (strikt)',
    'dist_mean_euclidean' : '<br>Mittelwert Distanzen (euclidean, gewichtet)'
})

fig = px.box(
    meta_plot,
    x = 'Modus (strikt)',
    y = '<br>Mittelwert Distanzen (euclidean, gewichtet)',
    points = 'all',
    hover_data = ['id', 'author', 'title',]
)

fig.update_layout(
    width = 1000, height = 600,
    xaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    yaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    showlegend = False
)
fig.write_image(f"plots/6.2 Zusammenhang zweier Verfahrensvarianten zur Messung von Zentralität.pdf")
fig.show()
In [23]:
meta_all[[
    'mode_score_strikt', 'mode_score_flexibel', 
    
    'dist_centroid_manhattan_unweighted', 'dist_centroid_euclidean_unweighted', 
    'dist_centroid_cosine_unweighted', 'dist_centroid_alldistances_unweighted',
    'dist_centroid_manhattan', 'dist_centroid_euclidean', 'dist_centroid_cosine', 'dist_centroid_alldistances',
    
    'dist_mean_manhattan', 'dist_mean_euclidean', 'dist_mean_cosine', 'dist_mean_alldistances',
    'dist_mean_manhattan_unweighted', 'dist_mean_euclidean_unweighted', 'dist_mean_cosine_unweighted', 
    'dist_mean_alldistances_unweighted',
    
    'eigenvector_centrality'
]].corr()
Out[23]:
mode_score_strikt mode_score_flexibel dist_centroid_manhattan_unweighted dist_centroid_euclidean_unweighted dist_centroid_cosine_unweighted dist_centroid_alldistances_unweighted dist_centroid_manhattan dist_centroid_euclidean dist_centroid_cosine dist_centroid_alldistances dist_mean_manhattan dist_mean_euclidean dist_mean_cosine dist_mean_alldistances dist_mean_manhattan_unweighted dist_mean_euclidean_unweighted dist_mean_cosine_unweighted dist_mean_alldistances_unweighted eigenvector_centrality
mode_score_strikt 1.000000 0.952823 -0.900802 -0.914699 -0.888176 -0.914241 -0.908581 -0.903217 -0.882121 -0.911031 -0.908174 -0.906328 -0.887145 -0.911418 -0.904079 -0.919082 -0.891435 -0.915807 0.914046
mode_score_flexibel 0.952823 1.000000 -0.887655 -0.897019 -0.896656 -0.906832 -0.877581 -0.871240 -0.878015 -0.886252 -0.877557 -0.873846 -0.880373 -0.887011 -0.887584 -0.901716 -0.898782 -0.907626 0.884781
dist_centroid_manhattan_unweighted -0.900802 -0.887655 1.000000 0.979699 0.940538 0.986774 0.880244 0.869071 0.866409 0.883292 0.881619 0.870721 0.869441 0.884200 0.998595 0.980713 0.942623 0.986269 -0.881673
dist_centroid_euclidean_unweighted -0.914699 -0.897019 0.979699 1.000000 0.952941 0.991597 0.896486 0.901334 0.874381 0.904018 0.899069 0.901863 0.878798 0.903991 0.983311 0.999396 0.955470 0.990811 -0.901725
dist_centroid_cosine_unweighted -0.888176 -0.896656 0.940538 0.952941 1.000000 0.979029 0.860968 0.856909 0.918110 0.885118 0.862470 0.857055 0.916545 0.886616 0.942132 0.954520 0.999925 0.979573 -0.864334
dist_centroid_alldistances_unweighted -0.914241 -0.906832 0.986774 0.991597 0.979029 1.000000 0.891894 0.888581 0.899602 0.903857 0.893752 0.889345 0.901565 0.904659 0.988128 0.992260 0.980548 0.999754 -0.895228
dist_centroid_manhattan -0.908581 -0.877581 0.880244 0.896486 0.860968 0.891894 1.000000 0.989582 0.947129 0.994697 0.998880 0.991083 0.954510 0.994123 0.880928 0.901617 0.865158 0.892813 -0.997903
dist_centroid_euclidean -0.903217 -0.871240 0.869071 0.901334 0.856909 0.888581 0.989582 1.000000 0.938454 0.992968 0.990802 0.999380 0.946257 0.991381 0.871650 0.904848 0.861245 0.888947 -0.991212
dist_centroid_cosine -0.882121 -0.878015 0.866409 0.874381 0.918110 0.899602 0.947129 0.938454 1.000000 0.969403 0.948296 0.938923 0.999565 0.971236 0.866761 0.879664 0.919263 0.901389 -0.946271
dist_centroid_alldistances -0.911031 -0.886252 0.883292 0.904018 0.885118 0.903857 0.994697 0.992968 0.969403 1.000000 0.995098 0.993368 0.975010 0.999639 0.884659 0.908604 0.888628 0.904792 -0.994301
dist_mean_manhattan -0.908174 -0.877557 0.881619 0.899069 0.862470 0.893752 0.998880 0.990802 0.948296 0.995098 1.000000 0.992528 0.955772 0.995407 0.883703 0.904222 0.866736 0.895136 -0.999300
dist_mean_euclidean -0.906328 -0.873846 0.870721 0.901863 0.857055 0.889345 0.991083 0.999380 0.938923 0.993368 0.992528 1.000000 0.946930 0.992440 0.873482 0.906108 0.861505 0.890062 -0.993384
dist_mean_cosine -0.887145 -0.880373 0.869441 0.878798 0.916545 0.901565 0.954510 0.946257 0.999565 0.975010 0.955772 0.946930 1.000000 0.976854 0.870009 0.884124 0.918031 0.903375 -0.954167
dist_mean_alldistances -0.911418 -0.887011 0.884200 0.903991 0.886616 0.904659 0.994123 0.991381 0.971236 0.999639 0.995407 0.992440 0.976854 1.000000 0.886078 0.908950 0.890155 0.905964 -0.994878
dist_mean_manhattan_unweighted -0.904079 -0.887584 0.998595 0.983311 0.942132 0.988128 0.880928 0.871650 0.866761 0.884659 0.883703 0.873482 0.870009 0.886078 1.000000 0.984388 0.944320 0.988446 -0.883459
dist_mean_euclidean_unweighted -0.919082 -0.901716 0.980713 0.999396 0.954520 0.992260 0.901617 0.904848 0.879664 0.908604 0.904222 0.906108 0.884124 0.908950 0.984388 1.000000 0.957110 0.991975 -0.907192
dist_mean_cosine_unweighted -0.891435 -0.898782 0.942623 0.955470 0.999925 0.980548 0.865158 0.861245 0.919263 0.888628 0.866736 0.861505 0.918031 0.890155 0.944320 0.957110 1.000000 0.981126 -0.868728
dist_mean_alldistances_unweighted -0.915807 -0.907626 0.986269 0.990811 0.979573 0.999754 0.892813 0.888947 0.901389 0.904792 0.895136 0.890062 0.903375 0.905964 0.988446 0.991975 0.981126 1.000000 -0.896486
eigenvector_centrality 0.914046 0.884781 -0.881673 -0.901725 -0.864334 -0.895228 -0.997903 -0.991212 -0.946271 -0.994301 -0.999300 -0.993384 -0.954167 -0.994878 -0.883459 -0.907192 -0.868728 -0.896486 1.000000
In [24]:
# Heatmap of absolute pairwise correlations between the centrality variants
# (rounded to 3 decimals). zmin=0.8 spreads the colour scale over the high
# correlation range; German display labels replace the raw column names.
meta_plot = round(meta_all[[
    'mode_score_strikt', 'mode_score_flexibel', 

    'dist_mean_manhattan', 'dist_mean_euclidean', 'dist_mean_cosine', 'dist_mean_alldistances',
    'dist_mean_manhattan_unweighted', 'dist_mean_euclidean_unweighted', 'dist_mean_cosine_unweighted', 
    'dist_mean_alldistances_unweighted',
    
    'dist_centroid_manhattan', 
    'eigenvector_centrality'
]].corr(), 3).abs()

# Raw column name -> human-readable label (trailing space keeps axis ticks
# from touching the plot edge).
category_dic = {
    'mode_score_strikt' : 'Modus (strikt) ',
    'mode_score_flexibel' : 'Modus (flexibel) ',
    'dist_mean_manhattan' : 'Mittelwert Distanzen (manhattan, gewichtet) ',
    'dist_mean_euclidean' : 'Mittelwert Distanzen (euclidean, gewichtet) ',
    'dist_mean_cosine' : 'Mittelwert Distanzen (cosine, gewichtet) ',
    'dist_mean_alldistances' : 'Mittelwert Distanzen (alle, gewichtet) ',
    'dist_mean_manhattan_unweighted' : 'Mittelwert Distanzen (manhattan, ungewichtet) ',
    'dist_mean_euclidean_unweighted' : 'Mittelwert Distanzen (euclidean, ungewichtet) ',
    'dist_mean_cosine_unweighted' : 'Mittelwert Distanzen (cosine, ungewichtet) ',
    'dist_mean_alldistances_unweighted' : 'Mittelwert Distanzen (alle, ungewichtet) ',
    'dist_centroid_manhattan' : 'Distanz zum Zentroid (manhattan, gewichtet) ',
    'eigenvector_centrality' : 'Eigenvektorzentralität (manhattan, gewichtet) ',
}

meta_plot = meta_plot.rename(columns=category_dic, index=category_dic)

fig = px.imshow(
    meta_plot, 
    text_auto=True, 
    aspect = "auto",
    zmin=0.8,
)
fig.update_layout(
    width = 1600, height = 800,
    xaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    yaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    font=dict(size=20),
    # showlegend = False
)
fig.write_image(f"plots/6.2 Korrelation verschiedener Verfahrensvarianten zur Messung von Zentralität.pdf")
fig.show()

Grundlegende Ergebnisse¶

Zentrale Merkmale (Modus_strikt)¶

In [25]:
# Resolve each vector row's author_title via its id, then flag anthology
# membership. A precomputed id->author_title mapping and a set lookup replace
# the former per-row meta.query() and per-row tolist() calls (each O(n),
# O(n^2) overall).
id_to_author_title = meta.drop_duplicates(subset='id').set_index('id')['author_title']
authortitle = meta_mode_strikt['id'].map(id_to_author_title).tolist()

anth_author_titles = set(meta_anth['author_title'])
korpus_anth = [x in anth_author_titles for x in authortitle]

meta_mode_strikt['korpus_anth'] = korpus_anth

# Share of anthology texts carrying each strict-mode feature.
(meta_mode_strikt
 .query("korpus_anth")
 .drop(["id", "mode_score_strikt", "missing_from_mode_strikt", "korpus_anth"], axis = 1)
).mean().round(2)
Out[25]:
strikt_Geschichtslyrik                                        1.00
strikt_empirisch                                              1.00
strikt_nicht theoretisch                                      0.97
strikt_Ballade (exakt)                                        0.54
strikt_Sprechinstanz nicht markiert                           0.56
strikt_Sprechinstanz Zeit unklar                              0.56
strikt_Erzählen (exakt)                                       0.57
strikt_Präsens und Präiteritum                                0.42
strikt_Konkretheit                                            0.82
strikt_keine Positionierung zum Wissen                        0.91
strikt_vergangenheitsomdinant                                 0.82
strikt_2 Zeitebenen (exakt)                                   0.46
strikt_zeitlich fixierbar                                     0.64
strikt_Beginn 1870                                            0.05
strikt_Ende 1870                                              0.04
strikt_keine Anachronismen                                    0.94
strikt_kein Gegenwartsbezug                                   0.71
strikt_Europa (exakt)                                         0.95
strikt_Heiliges Römisches Reich (exakt)                       0.41
strikt_Kein Kleinraum                                         0.39
strikt_Handlung (exakt)                                       0.58
strikt_Krieg (exakt)                                          0.28
strikt_positive Bewertung von Krieg (exakt)                   0.15
strikt_bekanntes Individuum (exakt)                           0.15
strikt_positive Bewertung von bekanntem Individuum (exakt)    0.10
strikt_kein Nationalismus                                     0.86
strikt_kein Heroismus                                         0.72
strikt_keine Religiosität                                     0.83
strikt_Personen-Marker (Titel + Text)                         0.46
strikt_keine Zeit-/Geschichts-Marker                          0.58
strikt_keine Ort-Marker                                       0.72
strikt_Objekt-/Institutionen-Marker (Text)                    0.55
strikt_kein Bezug auf Überlieferung                           0.77
strikt_keine Bewertung von Überlieferung                      0.77
strikt_kein Bezug auf Geschichtsauffassungen                  0.96
strikt_keine Bewertung von Geschichtsauffassungen             0.96
strikt_Ergänzung des Geschichtswissens                        0.75
strikt_Reim                                                   0.95
strikt_regelmäßiges Metrum                                    0.98
strikt_keine verfremdende Sprache                             0.99
dtype: float64

Korpora¶

In [26]:
# Stack the three (possibly overlapping) corpora for a side-by-side box plot.
meta_plot = pd.concat([
    meta_all.query("korpus_anth"),
    meta_all.query("korpus_modcanon"),
    meta_all.query("korpus_muench")
])
# Derive the corpus label from the two boolean flags; zipping the two columns
# avoids the former per-row Series construction via `meta_plot.iloc`.
meta_plot['korpus'] = [
    'anth' if is_anth else 'modcanon' if is_modcanon else 'muench'
    for is_anth, is_modcanon in zip(meta_plot['korpus_anth'], meta_plot['korpus_modcanon'])
]

px.box(
    meta_plot,
    y = 'mode_score_strikt',
    color = 'korpus',
    points = 'all',
    hover_data = ['author', 'title']
)

Zusammenhang mit Zahl der Abdrucke¶

In [27]:
# Binary flags: text reprinted at least 5 / at least 10 times in the
# anthology corpus. Vectorized comparison replaces the per-element
# comprehensions; NaN counts (non-anthology rows) compare False and become 0,
# exactly as before.
meta_all['count_min5'] = (meta_all['count'] >= 5).astype(int)
meta_all['count_min10'] = (meta_all['count'] >= 10).astype(int)
In [28]:
meta_all.query("corpus=='anth'")[[
    'count', 'count_min5', 'count_min10',
    'mode_score_strikt', 'dist_mean_alldistances'
]].corr()
Out[28]:
count count_min5 count_min10 mode_score_strikt dist_mean_alldistances
count 1.000000 0.796814 0.778227 0.067258 -0.059125
count_min5 0.796814 1.000000 0.511927 0.067102 -0.058313
count_min10 0.778227 0.511927 1.000000 0.035281 -0.039871
mode_score_strikt 0.067258 0.067102 0.035281 1.000000 -0.912061
dist_mean_alldistances -0.059125 -0.058313 -0.039871 -0.912061 1.000000
In [29]:
stats.pearsonr(
    meta_all.query("corpus=='anth'")['mode_score_strikt'], 
    meta_all.query("corpus=='anth'")['count']
)
Out[29]:
PearsonRResult(statistic=0.06725808687682228, pvalue=0.003801235216825594)
In [30]:
stats.pearsonr(
    meta_all.query("corpus=='anth'")['dist_mean_alldistances'], 
    meta_all.query("corpus=='anth'")['count']
)
Out[30]:
PearsonRResult(statistic=-0.0591246699114598, pvalue=0.010973088602442877)
In [31]:
# Mode score for rarely vs. frequently anthologized texts; the 0/1 flag is
# recoded to readable labels for the plot only (on a copy, so meta_all keeps
# the numeric flag used by the t-test below).
meta_plot = meta_all.query("corpus=='anth'").copy()
meta_plot['count_min5'] = meta_plot['count_min5'].replace({0 : 'unter 5', 1: '5 oder mehr'})

px.box(
    meta_plot.sort_values(by='count_min5', ascending=False),
    x = 'count_min5',
    y = 'mode_score_strikt',
    points = 'all',
    hover_data = ['author', 'title'],
    labels = {'count_min5' : 'Vorkommen im Anthologiekorpus',
              'mode_score_strikt' : 'Modus (strikt)'
             }
)
In [32]:
stats.ttest_ind(
    meta_all.query("corpus=='anth' and count_min5 == 1")['mode_score_strikt'],
    meta_all.query("corpus=='anth' and count_min5 == 0")['mode_score_strikt']
)
Out[32]:
TtestResult(statistic=2.891123789799813, pvalue=0.003883510462262761, df=1848.0)
In [33]:
get_cohens_d(
    meta_all.query("corpus=='anth' and count_min5 == 1")['mode_score_strikt'],
    meta_all.query("corpus=='anth' and count_min5 == 0")['mode_score_strikt']
)
Out[33]:
0.26727065034081565

Texte¶

In [34]:
meta_all['words'] = [len(' '.join(x).split(" ")) if str(x) != 'None' else x for x in meta_all['text_bestocr']]
In [35]:
(meta_all.query("korpus_anth")[[
    "author", "title", "year", "count", "mode_score_strikt", 'words', "missing_from_mode_strikt",
]]
 .sort_values(by = "author")
 .sort_values(by = "mode_score_strikt", ascending = False)
 .query("mode_score_strikt >= 34")
)
Out[35]:
author title year count mode_score_strikt words missing_from_mode_strikt
780 Weinholz, Albert Otto von Wittelsbachs Bergfahrt 1858.0 1.0 35.0 648.0 strikt_Beginn 1870 + strikt_Ende 1870 + strikt_bekanntes Individuum (exakt) + strikt_positive Bewertung von bekanntem Individuum (exakt) + strikt_kein Heroismus
1094 Meyer, Conrad Ferdinand Die Schweizer des Herrn von Tremouille 1875.0 2.0 35.0 338.0 strikt_2 Zeitebenen (exakt) + strikt_Beginn 1870 + strikt_Ende 1870 + strikt_bekanntes Individuum (exakt) + strikt_positive Bewertung von bekanntem Individuum (exakt)
129 Priem, Johann Paul Der Schneidergeneral. 1. Der Rekrut 1858.0 1.0 34.0 NaN strikt_2 Zeitebenen (exakt) + strikt_Beginn 1870 + strikt_Ende 1870 + strikt_bekanntes Individuum (exakt) + strikt_positive Bewertung von bekanntem Individuum (exakt) + strikt_Personen-Marker (Titel + Text)
1626 Lingg, Hermann Heinrich der Finkler 1870.0 1.0 34.0 238.0 strikt_Präsens und Präiteritum + strikt_Beginn 1870 + strikt_Ende 1870 + strikt_Heiliges Römisches Reich (exakt) + strikt_Krieg (exakt) + strikt_positive Bewertung von Krieg (exakt)
292 Brunold, Friedrich König Christian I. von Dänemark und Henning Wulf 1859.0 2.0 34.0 274.0 strikt_2 Zeitebenen (exakt) + strikt_Beginn 1870 + strikt_Ende 1870 + strikt_positive Bewertung von Krieg (exakt) + strikt_bekanntes Individuum (exakt) + strikt_positive Bewertung von bekanntem Individuum (exakt)
927 Liliencron, Detlev von Wibke Pogwisch 1889.0 1.0 34.0 407.0 strikt_Beginn 1870 + strikt_Ende 1870 + strikt_positive Bewertung von Krieg (exakt) + strikt_bekanntes Individuum (exakt) + strikt_positive Bewertung von bekanntem Individuum (exakt) + strikt_Personen-Marker (Titel + Text)
833 Helmers, Heinrich Maria Theresia in Preßburg 1887.0 1.0 34.0 290.0 strikt_Erzählen (exakt) + strikt_Beginn 1870 + strikt_Ende 1870 + strikt_Kein Kleinraum + strikt_bekanntes Individuum (exakt) + strikt_positive Bewertung von bekanntem Individuum (exakt)
1740 Richter, Paul Brusehawer 1908.0 1.0 34.0 369.0 strikt_Beginn 1870 + strikt_Ende 1870 + strikt_positive Bewertung von Krieg (exakt) + strikt_bekanntes Individuum (exakt) + strikt_positive Bewertung von bekanntem Individuum (exakt) + strikt_Personen-Marker (Titel + Text)
In [36]:
meta_all.query("author.str.contains('Priem')")[[
    'author', 'title', 'mode_score_strikt', 'dist_mean_euclidean'
]]
Out[36]:
author title mode_score_strikt dist_mean_euclidean
129 Priem, Johann Paul Der Schneidergeneral. 1. Der Rekrut 34.0 2.021937
130 Priem, Johann Paul Der Schneidergeneral. 2. Der Sieg von Stralsund 31.0 2.237389
In [37]:
(meta_all.query("korpus_anth")[[
    "author", "title", "year", "count", "mode_score_strikt", 'words', # "missing_from_mode_strikt",
]]
 .sort_values(by = "author")
 .sort_values(by = "mode_score_strikt", ascending = True)
 .query("mode_score_strikt <= 13")
)
Out[37]:
author title year count mode_score_strikt words
650 Jahn, Franz Erfüllung 1870.0 3.0 11.0 132.0
620 Meyer, Johannes Der deutschen Jugend 1881.0 1.0 12.0 180.0
1284 Schack, Adolf Friedrich Graf von Rast bei Milet 1866.0 1.0 12.0 187.0
1309 Lingg, Hermann Pompeji 1854.0 1.0 13.0 174.0
454 Schrott, Johannes König Ludwig I. 1866.0 1.0 13.0 540.0
121 Niedergesäß, Robert Es treibet ohne Rast und Ruh 1859.0 1.0 13.0 NaN
1682 Fallersleben, Heinrich Hoffmann von Weltgeschichte 1871.0 1.0 13.0 69.0
1103 Rosegger, Peter Ein Blättchen Papier 1875.0 1.0 13.0 153.0

Autor:innen¶

In [38]:
# Restrict the anthology corpus to authors represented by at least 10 texts.
meta_plot = meta_all.query("korpus_anth").copy()

counts_per_author = meta_plot['author'].value_counts()
prolific_authors = counts_per_author[counts_per_author >= 10].index
meta_plot = meta_plot[meta_plot['author'].isin(prolific_authors)]
In [39]:
# Mean of every numeric column per author; display authors ranked by their
# average strict mode score.
results = meta_plot.groupby('author').mean(numeric_only=True)

results[['mode_score_strikt']].sort_values(by='mode_score_strikt', ascending=False)
Out[39]:
mode_score_strikt
author
Böttger, Adolf 30.400000
Schrutz, Demetrius 29.000000
Müller von Königswinter, Wolfgang 28.652174
Krais, Julius 28.500000
Geißler, Max 28.363636
Stöber, Adolf 27.800000
Gruppe, Otto Friedrich 27.690476
Meyern, Gustav von 27.583333
Sturm, Julius 27.550000
Frey, Adolf 27.500000
Gaudy, Alice von 27.333333
Netz, Karl Ludwig 26.900000
Kirchner, Friedrich 26.846154
Meyer, Conrad Ferdinand 26.720000
Wickenburg, Albrecht von 26.615385
Weilen, Josef von 26.538462
Kaufmann, Alexander 26.083333
Liliencron, Detlev von 26.058824
Lissauer, Ernst 25.666667
Schults, Adolf 25.363636
Dahn, Felix 25.301587
Münchhausen, Börries von 25.235294
Hesekiel, George 25.181818
Fontane, Theodor 25.153846
Schack, Adolf Friedrich Graf von 25.000000
Lingg, Hermann 24.987342
Scheffel, Joseph Viktor von 24.923077
Greif, Martin 24.892857
Stieler, Karl 24.823529
Möser, Albert 24.677966
Vierordt, Heinrich 24.612903
Gerok, Karl 23.833333
Geibel, Emanuel 23.722222
Wildenbruch, Ernst von 23.416667

Zeitverlauf und Korpusvergleich¶

In [40]:
# Text groups for the corpus comparison (coarse variant): anthology corpus
# split by decade, plus the two contrast corpora. Keys are meta_all.query()
# expressions, values are display labels.
queries_a = {
    # "korpus_anth" : 'Anthologiekorpus',
    "korpus_anth and decade == 1850" : '1850er',
    "korpus_anth and decade == 1860" : '1860er',
    "korpus_anth and decade == 1870" : '1870er',
    "korpus_anth and decade == 1880" : '1880er',
    "korpus_anth and decade == 1890" : '1890er',
    "korpus_anth and decade == 1900" : '1900er',
    "korpus_anth and decade == 1910" : '1910er',
    "korpus_modcanon" : 'Kanonisierte Moderne',
    "korpus_muench" : 'Münchhausen-Kreis'
}
In [41]:
# Text groups for the corpus comparison (fine variant): anthology corpus in
# five-year windows (last window covers 1915-1918), plus the two contrast
# corpora. Same key/value convention as queries_a.
queries_b = {
    "korpus_anth and 1850 <= year <= 1854" : '1850–1854',
    "korpus_anth and 1855 <= year <= 1859" : '1855–1859',
    "korpus_anth and 1860 <= year <= 1864" : '1860–1864',
    "korpus_anth and 1865 <= year <= 1869" : '1865–1869',
    "korpus_anth and 1870 <= year <= 1874" : '1870–1874',
    "korpus_anth and 1875 <= year <= 1879" : '1875–1879',
    "korpus_anth and 1880 <= year <= 1884" : '1880–1884',
    "korpus_anth and 1885 <= year <= 1889" : '1885–1889',
    "korpus_anth and 1890 <= year <= 1894" : '1890–1894',
    "korpus_anth and 1895 <= year <= 1899" : '1895–1899',
    "korpus_anth and 1900 <= year <= 1904" : '1900–1904',
    "korpus_anth and 1905 <= year <= 1909" : '1905–1909',
    "korpus_anth and 1910 <= year <= 1914" : '1910–1914',
    "korpus_anth and 1915 <= year <= 1918" : '1915–1918',
    "korpus_modcanon" : 'Kanonisierte Moderne',
    "korpus_muench" : 'Münchhausen-Kreis'
}

Anthologiekorpus 1850er, Modus¶

In [42]:
# Strict mode score relative to the 1850s reference (mode_score_strikt1850),
# collected for every text group in queries_a.
meta_parts = []

for corpus_query, corpus_label in queries_a.items():
    this_meta = meta_all.query(corpus_query)

    meta_add = this_meta[['author', 'title']].copy()
    meta_add['corpus'] = corpus_label
    meta_add['dist'] = this_meta['mode_score_strikt1850'].tolist()

    meta_parts.append(meta_add)

# Single concat instead of growing the frame inside the loop (which copies
# all accumulated rows on every iteration — quadratic).
meta_plot = pd.concat(meta_parts)

fig = px.box(
    meta_plot,
    x = 'corpus',
    y = 'dist',
    # points = 'all',
    hover_data = ['author', 'title'],
    labels = {'dist' : '<br>Modus_1850 (strikt)', 'corpus' : ''}
)

fig.update_layout(
    width = 1000, height = 600,
    xaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    yaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    showlegend = False
)
fig.write_image(f"plots/6.2 Moduswerte je Textgruppe, Bezugskorpus: Anthologietexte der 1850er.pdf")
fig.show()
In [43]:
# Same as the previous cell, but with the fine-grained five-year groups.
meta_parts = []

for corpus_query, corpus_label in queries_b.items():
    this_meta = meta_all.query(corpus_query)

    meta_add = this_meta[['author', 'title']].copy()
    meta_add['corpus'] = corpus_label
    meta_add['dist'] = this_meta['mode_score_strikt1850'].tolist()

    meta_parts.append(meta_add)

# Single concat instead of concatenating inside the loop (quadratic copies).
meta_plot = pd.concat(meta_parts)

fig = px.box(
    meta_plot,
    x = 'corpus',
    y = 'dist',
    # points = 'all',
    hover_data = ['author', 'title'],
    labels = {'dist' : '<br>Modus_1850 (strikt)', 'corpus' : ''}
)

fig.update_layout(
    width = 1000, height = 800,
    xaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    yaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    showlegend = False
)

fig.show()

Anthologiekorpus_1850er, Distanzen¶

In [44]:
this_dm = dm_alldistances
In [45]:
meta_1850 = meta_all.query("korpus_anth and 1850 <= year <= 1859")
In [46]:
# Mean distance of every text in each group to the anthology texts of the
# 1850s (rows of this_dm restricted to meta_1850.id).
meta_parts = []

for corpus_query, corpus_label in queries_a.items():
    this_meta = meta_all.query(corpus_query)
    this_corpus_dm = this_dm.loc[meta_1850.id, this_meta.id]
    this_distances = this_corpus_dm.mean()

    meta_add = this_meta[['author', 'title', 'year', 'count', 'mode_score_strikt1850']].copy()
    meta_add['corpus'] = corpus_label
    meta_add['dist'] = this_distances.tolist()

    meta_parts.append(meta_add)

# Single concat instead of growing the frame inside the loop (quadratic copies).
meta_plot = pd.concat(meta_parts)

fig = px.box(
    meta_plot,
    x = 'corpus',
    y = 'dist',
    # points = 'all',
    hover_data = ['author', 'title'],
    labels = {'dist' : '<br>Mittelwert Distanzen_1850<br>(alle, gewichtet)', 'corpus' : ''}
)

fig.update_layout(
    width = 1000, height = 600,
    xaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    yaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    showlegend = False
)
fig.write_image(f"plots/6.2 Distanzwerte je Textgruppe, Bezugskorpus: Anthologietexte der 1850er.pdf")
fig.show()
In [47]:
# Same distance measure as the previous cell, with the five-year groups.
meta_parts = []

for corpus_query, corpus_label in queries_b.items():
    this_meta = meta_all.query(corpus_query)
    this_corpus_dm = this_dm.loc[meta_1850.id, this_meta.id]
    this_distances = this_corpus_dm.mean()

    meta_add = this_meta[['author', 'title', 'year', 'count', 'mode_score_strikt1850']].copy()
    meta_add['corpus'] = corpus_label
    meta_add['dist'] = this_distances.tolist()

    meta_parts.append(meta_add)

# Single concat instead of concatenating inside the loop (quadratic copies).
meta_plot = pd.concat(meta_parts)

fig = px.box(
    meta_plot,
    x = 'corpus',
    y = 'dist',
    # points = 'all',
    hover_data = ['author', 'title'],
    labels = {'dist' : '<br>Mittelwert Distanzen_1850<br>(alle, gewichtet)', 'corpus' : ''}
)

fig.update_layout(
    width = 1000, height = 800,
    xaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    yaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    showlegend = False
)

fig.show()

Zusammenhang (im Anthologiekorpus)?¶

In [48]:
# For every anthology text: mean distance to the 1850s anthology texts,
# as basis for the year/centrality correlations below.
meta_corr = meta_all.query("corpus=='anth'").copy()
meta_corr_dm = dm_alldistances.loc[meta_1850.id, meta_corr.id]
meta_corr_distances = meta_corr_dm.mean()
meta_corr['dist'] = meta_corr_distances.tolist()
In [49]:
# Correlation of publication year with both 1850s-relative centrality measures.
meta_corr[[
    'year',
    'mode_score_strikt1850', 'dist', 
]].corr() # .round(2)
Out[49]:
year mode_score_strikt1850 dist
year 1.000000 -0.118972 0.113978
mode_score_strikt1850 -0.118972 1.000000 -0.929779
dist 0.113978 -0.929779 1.000000
In [50]:
# Zusammenhang mode_score_strikt1850 (r, p)
years = meta_corr['year']
centrality = meta_corr['mode_score_strikt1850']
r, p_value = stats.pearsonr(years, centrality)
print(f"{r} / {round(p_value, 10)}")
-0.11897204582514027 / 2.867e-07
In [51]:
# Zusammenhang dist_mean_alldistances (r, p)
years = meta_corr['year']
centrality = meta_corr['dist']
r, p_value = stats.pearsonr(years, centrality)
print(f"{r} / {round(p_value, 10)}")
0.11397785048239731 / 8.874e-07
In [52]:
fig = px.scatter(
    meta_corr,
    x = 'year',
    y = 'dist',
    hover_data = ['author', 'title'],
    trendline = 'ols',
    labels = {'dist': '<br>Mittelwert Distanzen_1850<br>(alle, gewichtet)',
              'year' : ''
             }
)
fig.show()

Dimensionsreduktion¶

Features für Plot-Einfärbung¶

In [53]:
# Colour label per annotated genre: the five single genres keep their own
# name; combined annotations ('A + B') and missing annotations get
# placeholder labels.
single_genres = {'Ballade', 'Lied', 'Denkmal-/Ruinenpoesie', 'Sonett', 'Rollengedicht'}

def _gattung_color(gattung):
    """Map one raw 'gattung' annotation to its display label."""
    if gattung in single_genres:
        return gattung
    if ' + ' in str(gattung):
        return '[mehrere annotierte Gattungen]'
    return '[keine annotierte Gattung]'

# Column-wise map replaces the former positional .at[i, ...] loop (which
# silently assumed a 0..n-1 RangeIndex on meta_all).
meta_all['gattung_color'] = meta_all['gattung'].map(_gattung_color)

# .map() instead of .replace() avoids pandas' FutureWarning about silent
# downcasting; every label above is covered by the dict, so results match.
meta_all['gattung_color_order'] = meta_all['gattung_color'].map({
    '[keine annotierte Gattung]' : 0,
    '[mehrere annotierte Gattungen]' : 1, 
    'Ballade' : 2,
    'Rollengedicht' : 3,
    'Denkmal-/Ruinenpoesie' : 4,
    'Lied' : 5,
    'Sonett' : 6
})
/var/folders/45/zsyytpq97xq280z_cvw88j240000gn/T/ipykernel_6439/1334992829.py:17: FutureWarning:

Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`

In [54]:
# for i, (sprechinstanz_markiert, vergangenheitsdominant) in enumerate(zip(meta_all['sprechinstanz_markiert'], meta_all['vergangenheitsdominant'])):
#     if sprechinstanz_markiert == 1 and vergangenheitsdominant == 1:
#         meta_all.at[i, 'vergangenheitsdominant'] = 'Sprechinstanz markiert +<br>vergangenheitsdominant'
#     elif sprechinstanz_markiert == 1 and vergangenheitsdominant != 1:
#         meta_all.at[i, 'vergangenheitsdominant'] = 'Sprechinstanz markiert +<br>nicht vergangenheitsdomainant'
#     elif sprechinstanz_markiert == 0 and vergangenheitsdominant == 1:
#         meta_all.at[i, 'sprechinstanz_vergangenheitsdominant'] = 'Sprechinstanz nicht markiert +<br>vergangenheitsdominant'
#     else:
#         meta_all.at[i, 'sprechinstanz_vergangenheitsdominant'] = 'Sprechinstanz nicht markiert +<br>nicht vergangenheitsdomainant'
In [55]:
# Recode 'vergangenheitsdominant' from a numeric flag (1 = past-dominant)
# to readable labels.  The original row-wise .at loop wrote strings into a
# float64 column, which triggered pandas' incompatible-dtype FutureWarning;
# rebuilding the column in one assignment yields the same values in a clean
# object column.
meta_all['vergangenheitsdominant'] = [
    'vergangenheitsdominant' if v == 1 else 'nicht vergangenheitsdominant'
    for v in meta_all['vergangenheitsdominant']
]
/var/folders/45/zsyytpq97xq280z_cvw88j240000gn/T/ipykernel_6439/2378240172.py:3: FutureWarning:

Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'vergangenheitsdominant' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.

In [56]:
# Classify each text's treatment of the subject area 'Militär/Krieg':
# treated and rated '1' (positive), treated with any other rating, or
# not treated at all.
rating_table = get_rating_table(meta_all, mode='themes')
rating_table['author_title'] = rating_table['author'] + ' – ' + rating_table['title']

for i, author_title in enumerate(meta_all['author_title']):
    text_ratings = rating_table[rating_table['author_title'] == author_title]

    if 'Militär/Krieg' not in text_ratings['type'].tolist():
        meta_all.at[i, 'Militär/Krieg'] = 'nicht behandelt'
    elif text_ratings.loc[text_ratings['type'] == 'Militär/Krieg', 'rating'].tolist()[0] == '1':
        meta_all.at[i, 'Militär/Krieg'] = 'behandelt und positiv bewertet'
    else:
        meta_all.at[i, 'Militär/Krieg'] = 'behandelt und negativ bewertet'
In [57]:
# Assign each text to exactly one corpus label; texts belonging to more than
# one corpus fall into the residual category.  Changes vs. the original:
# itertuples instead of iterating `meta_all.iloc` directly, and .map instead
# of .replace (avoids the FutureWarning about silent downcasting in
# `replace`; all four labels are covered by the dict).
# NOTE(review): assumes meta_all has a default RangeIndex, as the earlier
# `.at[i, ...]` loops in this notebook already do.
def _corpus_label(row):
    """Return the corpus label for one row's three membership flags."""
    # `== False` is kept deliberately: it matches both False and 0.
    if row.korpus_anth and row.korpus_modcanon == False and row.korpus_muench == False:
        return 'Anthologiekorpus'
    if row.korpus_anth == False and row.korpus_modcanon and row.korpus_muench == False:
        return 'Kanonisierte Moderne'
    if row.korpus_anth == False and row.korpus_modcanon == False and row.korpus_muench:
        return 'Münchhausen-Kreis'
    return '[Mehrere Korpora]'

meta_all['korpus_color'] = [_corpus_label(row) for row in meta_all.itertuples(index=False)]

# Fixed ordering used to control legend / draw order in the plots below.
meta_all['korpus_color_order'] = meta_all['korpus_color'].map({
    'Anthologiekorpus': 0,
    'Kanonisierte Moderne': 1,
    'Münchhausen-Kreis': 2,
    '[Mehrere Korpora]': 3,
})
/var/folders/45/zsyytpq97xq280z_cvw88j240000gn/T/ipykernel_6439/2452535144.py:11: FutureWarning:

Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`

In [58]:
# NOTE(review): this cell is an exact duplicate of the previous cell
# (same corpus-colour assignment and same ordering map).  Re-running it is
# idempotent, but one of the two cells should be removed.
for i, element in enumerate(meta_all.iloc):
    if element.korpus_anth and element.korpus_modcanon == False and element.korpus_muench == False:
        meta_all.at[i, 'korpus_color'] = 'Anthologiekorpus'
    elif element.korpus_anth == False and element.korpus_modcanon and element.korpus_muench == False:
        meta_all.at[i, 'korpus_color'] = 'Kanonisierte Moderne'
    elif element.korpus_anth == False and element.korpus_modcanon == False and element.korpus_muench:
        meta_all.at[i, 'korpus_color'] = 'Münchhausen-Kreis'
    else:
        meta_all.at[i, 'korpus_color'] = '[Mehrere Korpora]'
        
meta_all['korpus_color_order'] = meta_all['korpus_color'].replace({
    'Anthologiekorpus' : 0,
    'Kanonisierte Moderne' : 1,
    'Münchhausen-Kreis' : 2,
    '[Mehrere Korpora]' : 3
})
/var/folders/45/zsyytpq97xq280z_cvw88j240000gn/T/ipykernel_6439/2452535144.py:11: FutureWarning:

Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`

In [59]:
meta_all['anthology_decade'] = [(x//10)*10 if pd.isna(x) == False else float('NaN') for x in meta_all['anthology_year_used_ed']]

UMAP¶

In [60]:
this_dm = dm_alldistances
In [61]:
# UMAP on the precomputed distance matrix.  random_state pins the embedding
# for reproducibility; as the warning below notes, this forces single-threaded
# execution and disables inverse_transform for precomputed metrics.
import umap.umap_ as umap

n_components = 2

model = umap.UMAP(
    n_components = n_components,
    metric = 'precomputed',
    random_state=0,
)
In [62]:
column_names = ['umap_dim_' + str(i+1) for i in range(n_components)]
In [63]:
# Embed all texts.  Rows/columns of the distance matrix are selected in the
# order of meta_all['id'], so embedding rows align with meta_all rows.
meta_all = meta_all.copy()
ids = meta_all['id']
meta_all[column_names] = model.fit_transform(this_dm.loc[ids, ids])
/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/umap/umap_.py:1865: UserWarning:

using precomputed metric; inverse_transform will be unavailable

/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/umap/umap_.py:1952: UserWarning:

n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.

In [64]:
# Plain two-dimensional UMAP projection of all texts.
meta_plot = meta_all.copy()

fig = px.scatter(
    meta_plot,
    x='umap_dim_1',
    y='umap_dim_2',
    hover_data=['author', 'title', 'gattung'],
    labels={'umap_dim_1': '', 'umap_dim_2': ''},
)

fig.update_traces(marker=dict(size=6))
fig.update_layout(
    width=1000, height=500,
    legend=dict(font=dict(size=16), itemsizing='constant'),
    xaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    yaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
)
fig.write_image("plots/6.2 Zweidimensionale Projektion.pdf")
fig.show()
In [65]:
# Projection coloured by publication year.
fig = px.scatter(
    meta_plot.sort_values(by='decade', ascending=True),
    x='umap_dim_1',
    y='umap_dim_2',
    color='year',
    hover_data=['author', 'title', 'gattung'],
    labels={'umap_dim_1': '', 'umap_dim_2': '', 'year': ''},
)

fig.update_traces(marker=dict(size=6))
fig.update_layout(
    width=1000, height=500,
    xaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    yaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
)
fig.write_image("plots/6.2 Zweidimensionale Projektion (Jahre).pdf")
fig.show()
In [66]:
# Projection coloured by corpus membership.
meta_plot = meta_all.copy()

fig = px.scatter(
    meta_plot.sort_values(by='korpus_color_order'),
    x='umap_dim_1',
    y='umap_dim_2',
    color='korpus_color',
    color_discrete_sequence=['Black', 'yellow', 'lightgreen', 'grey'],
    hover_data=['author', 'title', 'gattung'],
    labels={'umap_dim_1': '', 'umap_dim_2': '', 'korpus_color': 'Korpus'},
)

fig.update_traces(marker=dict(size=6))
fig.update_layout(
    width=1000, height=500,
    legend=dict(font=dict(size=16), itemsizing='constant'),
    xaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    yaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
)
fig.write_image("plots/6.2 Zweidimensionale Projektion (Korpora).pdf")
fig.show()
In [67]:
# Projection of texts from pre-1950 anthologies, coloured by the publication
# year of the anthology edition used.
meta_plot = meta_all.copy()

fig = px.scatter(
    meta_plot.query("anthology_decade < 1950"),
    x='umap_dim_1',
    y='umap_dim_2',
    color='anthology_year_used_ed',
    hover_data=['author', 'title', 'gattung', 'anthology'],
    labels={'umap_dim_1': '', 'umap_dim_2': '',
            'anthology_year_used_ed': 'Erscheinungsdatum<br>Anthologie'},
)

fig.update_traces(marker=dict(size=6))
fig.update_layout(
    legend=dict(font=dict(size=16), itemsizing='constant'),
    xaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    yaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
)

fig.show()
In [68]:
# Projection coloured by annotated genre; the residual categories ('none' /
# 'several') come first in the colour sequence (black / grey-green).
meta_plot = meta_all.copy()

fig = px.scatter(
    meta_plot.sort_values(by='gattung_color_order'),
    x='umap_dim_1',
    y='umap_dim_2',
    color='gattung_color',
    color_discrete_sequence=['Black', '#6e7f80'] + px.colors.qualitative.Plotly[:5],
    hover_data=['author', 'title', 'gattung'],
    labels={'umap_dim_1': '', 'umap_dim_2': '', 'gattung_color': 'Gattung'},
)

fig.update_traces(marker=dict(size=6))
fig.update_layout(
    legend=dict(font=dict(size=16), itemsizing='constant'),
    xaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    yaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
)

fig.show()
In [69]:
# Projection split by past-dominance label.
meta_plot = meta_all.copy()

fig = px.scatter(
    meta_plot.sort_values(by='vergangenheitsdominant'),
    x='umap_dim_1',
    y='umap_dim_2',
    color='vergangenheitsdominant',
    hover_data=['author', 'title', 'gattung'],
    labels={'umap_dim_1': '', 'umap_dim_2': ''},
)

fig.update_traces(marker=dict(size=6))
fig.update_layout(
    legend=dict(font=dict(size=16), itemsizing='constant'),
    xaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    yaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
)

fig.show()
In [70]:
# Projection coloured by treatment of the subject area 'Militär/Krieg'.
meta_plot = meta_all.copy()

fig = px.scatter(
    meta_plot.sort_values(by='Militär/Krieg', ascending=False),
    x='umap_dim_1',
    y='umap_dim_2',
    color='Militär/Krieg',
    hover_data=['author', 'title', 'gattung'],
    labels={'umap_dim_1': '', 'umap_dim_2': '',
            'Militär/Krieg': 'Stoffgebiet Militär/Krieg'},
)

fig.update_traces(marker=dict(size=6))
fig.update_layout(
    legend=dict(font=dict(size=16), itemsizing='constant'),
    xaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    yaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
)

fig.show()
In [71]:
# meta_plot = meta_all.copy()
# 
# fig = px.scatter(
#     meta_plot,
#     x = 'umap_dim_1',
#     y = 'umap_dim_2',
#     color = 'sprechinstanz_zeitdominanz',
#     hover_data = ['author', 'title', 'gattung'],
#     labels = {'umap_dim_1' : '', 'umap_dim_2' : '', 
#               'sprechinstanz_zeitdominanz' : 'Sprechinstanz und Zeitdominanz'}
# )
# 
# fig.update_traces(marker={'size': 6})
# fig.update_layout(
#     # width = 1000, height = 600,
#     legend=dict(font=dict(size=16)),
#     xaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
#     yaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
# )
# fig.update_layout(legend= {'itemsizing': 'constant'})
# 
# fig.show()

Balladen unter nicht vergangenheitsdominanten Texten¶

In [72]:
# Vectorised recodings (replacing the per-row list comprehensions):
# - militaer_positiv: 1 if the subject area was treated and rated positively,
#   else 0 (bool -> int gives the same 0/1 values).
# - period: two publication periods; `between` is inclusive on both ends,
#   matching the original `1850 <= x <= 1884`, and NaN years compare False,
#   so they fall into '1885–1918' exactly as before.
meta_all['militaer_positiv'] = (meta_all['Militär/Krieg'] == 'behandelt und positiv bewertet').astype(int)
meta_all['period'] = meta_all['year'].between(1850, 1884).map({True: '1850–1884', False: '1885–1918'})
In [73]:
meta_test = meta_all.query("vergangenheitsdominant!='vergangenheitsdominant' and corpus=='anth'").copy()
In [74]:
# Contingency table period x positive military/war rating, with marginals
# and row-relative frequencies.
results = pd.crosstab(meta_test['period'], meta_test['militaer_positiv'], margins=True)
for flag in (0, 1):
    results[f'{flag}_rel'] = results[flag] / results['All']
results
Out[74]:
militaer_positiv 0 1 All 0_rel 1_rel
period
1850–1884 144 79 223 0.645740 0.354260
1885–1918 82 22 104 0.788462 0.211538
All 226 101 327 0.691131 0.308869
In [75]:
chi2_contingency(pd.crosstab(meta_test['period'], meta_test['militaer_positiv']), correction=False)
Out[75]:
Chi2ContingencyResult(statistic=6.767616255943135, pvalue=0.009282673896133627, dof=1, expected_freq=array([[154.12232416,  68.87767584],
       [ 71.87767584,  32.12232416]]))
In [76]:
get_phi(np.array(pd.crosstab(meta_test['period'], meta_test['militaer_positiv'])))
Out[76]:
0.14386130187509125

Vergleich: ohne Nicht-Vergangenheitsdominanz¶

In [77]:
meta_all.query("corpus=='anth'").groupby('period')['militaer_positiv'].mean()
Out[77]:
period
1850–1884    0.229569
1885–1918    0.208333
Name: militaer_positiv, dtype: float64
In [78]:
# p-value of the chi-squared test over the full anthology corpus (no
# restriction to non-past-dominant texts).  The subset is computed once
# instead of running the identical .query twice.
meta_anth = meta_all.query("corpus=='anth'")
chi2_contingency(
    pd.crosstab(meta_anth['period'], meta_anth['militaer_positiv']),
    correction=False,
)[1]
Out[78]:
0.32921562346581856
In [79]:
# Phi coefficient (effect size) for the same full-corpus contingency table.
# As in the previous cell, the anthology subset is computed once instead of
# duplicating the .query call.
meta_anth = meta_all.query("corpus=='anth'")
get_phi(np.array(pd.crosstab(meta_anth['period'], meta_anth['militaer_positiv'])))
Out[79]:
0.022684448317960584